3ddb79bc1_2bAt67x9MFCP4AZrQnvQ xen/arch/x86/process.c
3ddb79bc7KxGCEJsgBnkDX7XjD_ZEQ xen/arch/x86/rwlock.c
3ddb79bcrD6Z_rUvSDgrvjyb4846Eg xen/arch/x86/setup.c
+405b8599xI_PoEr3zZoJ2on-jdn7iw xen/arch/x86/shadow.c
3ddb79bcSx2e8JSR3pdSGa8x1ScYzA xen/arch/x86/smp.c
3ddb79bcfUN3-UBCPzX26IU8bq-3aw xen/arch/x86/smpboot.c
3ddb79bc-Udq7ol-NX4q9XsYnN7A2Q xen/arch/x86/time.c
40589968dD2D1aejwSOvrROg7fOvGQ xen/common/sched_bvt.c
40589968be_t_n0-w6ggceW7h-sx0w xen/common/sched_rrobin.c
3e397e6619PgAfBbw2XFbXkewvUWgw xen/common/schedule.c
-405b8599xI_PoEr3zZoJ2on-jdn7iw xen/common/shadow.c
3ddb79bdB9RNMnkQnUyZ5C9hhMSQQw xen/common/slab.c
3ddb79bd0gVQYmL2zvuJnldvD0AGxQ xen/common/softirq.c
3e7f358awXBC3Vw-wFRwPw18qL1khg xen/common/string.c
3ddb79c2QF5-pZGzuX4QukPCDAl59A xen/include/asm-x86/processor.h
40cf1596bim9F9DNdV75klgRSZ6Y2A xen/include/asm-x86/ptrace.h
3ddb79c2plf7ciNgoNjU-RsbUzawsw xen/include/asm-x86/rwlock.h
+405b8599BsDsDwKEJLS0XipaiQW3TA xen/include/asm-x86/shadow.h
3ddb79c3Hgbb2g8CyWLMCK-6_ZVQSQ xen/include/asm-x86/smp.h
3ddb79c3jn8ALV_S9W5aeTYUQRKBpg xen/include/asm-x86/smpboot.h
3ddb79c3NiyQE2vQnyGiaBnNjBO1rA xen/include/asm-x86/spinlock.h
40589969nPq3DMzv24RDb5LXE9brHw xen/include/xen/sched-if.h
3ddb79c0LzqqS0LhAQ50ekgj4oGl7Q xen/include/xen/sched.h
403a06a7H0hpHcKpAiDe5BPnaXWTlA xen/include/xen/serial.h
-405b8599BsDsDwKEJLS0XipaiQW3TA xen/include/xen/shadow.h
+40e3392dib7GrcBAu5cT-EUZTYzeEQ xen/include/xen/shadow.h
3ddb79c14dXIhP7C2ahnoD08K90G_w xen/include/xen/slab.h
3ddb79c09xbS-xxfKxuV3JETIhBzmg xen/include/xen/smp.h
3ddb79c1Vi5VleJAOKHAlY0G2zAsgw xen/include/xen/softirq.h
br260@br260.wolfson.cam.ac.uk
br260@labyrinth.cl.cam.ac.uk
br260@laudney.cl.cam.ac.uk
+djm@kirby.fc.hp.com
gm281@boulderdash.cl.cam.ac.uk
iap10@freefall.cl.cam.ac.uk
iap10@labyrinth.cl.cam.ac.uk
--- /dev/null
+/* -*- Mode:C++; c-file-style:BSD; c-basic-offset:4; tab-width:4 -*- */
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/mm.h>
+#include <xen/shadow.h>
+#include <asm/domain_page.h>
+#include <asm/page.h>
+#include <xen/event.h>
+#include <xen/trace.h>
+
+
+/********
+
+To use these shadow page tables, guests must not rely on the ACCESSED
+and DIRTY bits on L2 PTEs being accurate -- they will typically all be set.
+
+I doubt this will break anything. (If guests want to use the va_update
+mechanism they've signed up for this anyhow...)
+
+There's a per-domain shadow table spin lock which works fine for SMP
+hosts. We don't have to worry about interrupts as no shadow operations
+happen in an interrupt context. It's probably not quite ready for SMP
+guest operation as we have to worry about synchronisation between gpte
+and spte updates. It's possible that this might only happen in a
+hypercall context, in which case we'll probably have a per-domain
+hypercall lock anyhow (at least initially).
+
+********/
+
+
+/**
+
+FIXME:
+
+The shadow table flush command is dangerous on SMP systems as the
+guest may be using the L2 on one CPU while the other is trying to
+blow the table away.
+
+The current save/restore code works around this by not calling FLUSH,
+but by calling CLEAN2, which leaves all L2s intact (this is probably
+quicker anyhow).
+
+Even so, we have to be very careful. The flush code may need to cause
+a TLB flush on another CPU. It needs to do this while holding the
+shadow table lock. The trouble is, the guest may be in the shadow page
+fault handler spinning waiting to grab the shadow lock. It may have
+interrupts disabled, hence we can't use the normal flush_tlb_cpu
+mechanism.
+
+For the moment, we have a grim race whereby the spinlock in the shadow
+fault handler is actually a try lock, in a loop with a helper for the
+tlb flush code.
+
+A better solution would be to take a new flush lock, then raise a
+per-domain soft irq on the other CPU. The softirq will switch to
+init's PTs, then do an atomic inc of a variable to count himself in,
+then spin on a lock. Having noticed that the other guy has counted
+in, flush the shadow table, then release him by dropping the lock. He
+will then reload cr3 from mm.page_table on the way out of the softirq.
+
+In domain-softirq context we know that the guy holds no locks and has
+interrupts enabled. Nothing can go wrong ;-)
+
+**/
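+
+/* Illustrative sketch of the proposed scheme above -- hypothetical code,
+ * not part of this changeset: shadow_flush_lock, flush_ack and
+ * raise_shadow_flush_softirq() are invented names for the "flush lock",
+ * count-in variable and per-domain softirq it describes.
+ *
+ *   static spinlock_t shadow_flush_lock;
+ *   static atomic_t   flush_ack;
+ *
+ *   void flush_remote_shadow( struct mm_struct *m, int cpu )
+ *   {
+ *       spin_lock(&shadow_flush_lock);
+ *       atomic_set(&flush_ack, 0);
+ *       raise_shadow_flush_softirq(cpu);   // remote CPU switches to init's
+ *                                          // PTs, then counts itself in
+ *       while ( atomic_read(&flush_ack) == 0 )
+ *           rep_nop();                     // wait for the count-in
+ *       __free_shadow_table(m);            // safe: remote CPU is parked
+ *       spin_unlock(&shadow_flush_lock);   // release; he reloads cr3 from
+ *                                          // mm.page_table on softirq exit
+ *   }
+ */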
+
+static inline void free_shadow_page( struct mm_struct *m,
+ struct pfn_info *pfn_info )
+{
+ unsigned long flags;
+ unsigned long type = pfn_info->type_and_flags & PGT_type_mask;
+
+ m->shadow_page_count--;
+
+ if (type == PGT_l1_page_table)
+ perfc_decr(shadow_l1_pages);
+ else if (type == PGT_l2_page_table)
+ perfc_decr(shadow_l2_pages);
+ else printk("Free shadow weird page type pfn=%08x type=%08x\n",
+                pfn_info-frame_table, pfn_info->type_and_flags);
+
+ pfn_info->type_and_flags = 0;
+
+ spin_lock_irqsave(&free_list_lock, flags);
+ list_add(&pfn_info->list, &free_list);
+ free_pfns++;
+ spin_unlock_irqrestore(&free_list_lock, flags);
+}
+
+static void __free_shadow_table( struct mm_struct *m )
+{
+ int j, free=0;
+ struct shadow_status *a,*next;
+
+ // the code assumes you're not using the page tables i.e.
+ // the domain is stopped and cr3 is something else!!
+
+ // walk the hash table and call free_shadow_page on all pages
+
+ shadow_audit(m,1);
+
+ for(j=0;j<shadow_ht_buckets;j++)
+ {
+ a = &m->shadow_ht[j];
+ if (a->pfn)
+ {
+ free_shadow_page( m,
+ &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
+ a->pfn = 0;
+ a->spfn_and_flags = 0;
+ free++;
+ }
+ next=a->next;
+ a->next=NULL;
+ a=next;
+ while(a)
+ {
+ struct shadow_status *next = a->next;
+
+ free_shadow_page( m,
+ &frame_table[a->spfn_and_flags & PSH_pfn_mask] );
+ a->pfn = 0;
+ a->spfn_and_flags = 0;
+ free++;
+ a->next = m->shadow_ht_free;
+ m->shadow_ht_free = a;
+ a=next;
+ }
+ shadow_audit(m,0);
+ }
+ SH_LOG("Free shadow table. Freed= %d",free);
+}
+
+
+#define TABLE_OP_ZERO_L2 1
+#define TABLE_OP_ZERO_L1 2
+#define TABLE_OP_FREE_L1 3
+
+static inline int shadow_page_op( struct mm_struct *m, unsigned int op,
+ unsigned int gpfn,
+ struct pfn_info *spfn_info, int *work )
+{
+ unsigned int spfn = spfn_info-frame_table;
+ int restart = 0;
+
+ switch( op )
+ {
+ case TABLE_OP_ZERO_L2:
+ {
+ if ( (spfn_info->type_and_flags & PGT_type_mask) ==
+ PGT_l2_page_table )
+ {
+ unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
+#ifdef __i386__
+ memset(spl1e, 0, DOMAIN_ENTRIES_PER_L2_PAGETABLE * sizeof(*spl1e));
+#endif
+ unmap_domain_mem( spl1e );
+ }
+ }
+ break;
+
+ case TABLE_OP_ZERO_L1:
+ {
+ if ( (spfn_info->type_and_flags & PGT_type_mask) ==
+ PGT_l1_page_table )
+ {
+ unsigned long * spl1e = map_domain_mem( spfn<<PAGE_SHIFT );
+ memset( spl1e, 0, ENTRIES_PER_L1_PAGETABLE * sizeof(*spl1e) );
+ unmap_domain_mem( spl1e );
+ }
+ }
+ break;
+
+ case TABLE_OP_FREE_L1:
+ {
+ if ( (spfn_info->type_and_flags & PGT_type_mask) ==
+ PGT_l1_page_table )
+ {
+ // lock is already held
+ delete_shadow_status( m, gpfn );
+ restart = 1; // we need to go to start of list again
+ }
+ }
+
+ break;
+
+ default:
+ BUG();
+
+ }
+ return restart;
+}
+
+static void __scan_shadow_table( struct mm_struct *m, unsigned int op )
+{
+ int j, work=0;
+ struct shadow_status *a, *next;
+
+ // the code assumes you're not using the page tables i.e.
+ // the domain is stopped and cr3 is something else!!
+
+    // walk the hash table and call shadow_page_op on every entry
+
+ shadow_audit(m,1);
+
+ for(j=0;j<shadow_ht_buckets;j++)
+ {
+ retry:
+ a = &m->shadow_ht[j];
+ next = a->next;
+ if (a->pfn)
+ {
+ if ( shadow_page_op( m, op, a->pfn,
+ &frame_table[a->spfn_and_flags & PSH_pfn_mask],
+ &work ) )
+ goto retry;
+ }
+ a=next;
+ while(a)
+ {
+ next = a->next;
+ if ( shadow_page_op( m, op, a->pfn,
+ &frame_table[a->spfn_and_flags & PSH_pfn_mask],
+ &work ) )
+ goto retry;
+ a=next;
+ }
+ shadow_audit(m,0);
+ }
+ SH_VLOG("Scan shadow table. Work=%d l1=%d l2=%d", work, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
+}
+
+
+void shadow_mode_init(void)
+{
+}
+
+int shadow_mode_enable( struct domain *p, unsigned int mode )
+{
+ struct mm_struct *m = &p->mm;
+ struct shadow_status **fptr;
+ int i;
+
+ m->shadow_mode = mode;
+
+ // allocate hashtable
+ m->shadow_ht = kmalloc(shadow_ht_buckets *
+ sizeof(struct shadow_status));
+ if( m->shadow_ht == NULL )
+ goto nomem;
+
+ memset(m->shadow_ht, 0, shadow_ht_buckets * sizeof(struct shadow_status));
+
+ // allocate space for first lot of extra nodes
+ m->shadow_ht_extras = kmalloc(sizeof(void*) +
+ (shadow_ht_extra_size *
+ sizeof(struct shadow_status)));
+ if( m->shadow_ht_extras == NULL )
+ goto nomem;
+
+ memset( m->shadow_ht_extras, 0, sizeof(void*) + (shadow_ht_extra_size *
+ sizeof(struct shadow_status)) );
+
+ m->shadow_extras_count++;
+
+ // add extras to free list
+ fptr = &m->shadow_ht_free;
+ for ( i=0; i<shadow_ht_extra_size; i++ )
+ {
+ *fptr = &m->shadow_ht_extras[i];
+ fptr = &(m->shadow_ht_extras[i].next);
+ }
+ *fptr = NULL;
+ *((struct shadow_status ** )
+ &m->shadow_ht_extras[shadow_ht_extra_size]) = NULL;
+
+ if ( mode == SHM_logdirty )
+ {
+ m->shadow_dirty_bitmap_size = (p->max_pages+63)&(~63);
+ m->shadow_dirty_bitmap =
+ kmalloc( m->shadow_dirty_bitmap_size/8);
+ if( m->shadow_dirty_bitmap == NULL )
+ {
+ m->shadow_dirty_bitmap_size = 0;
+ goto nomem;
+ }
+ memset(m->shadow_dirty_bitmap,0,m->shadow_dirty_bitmap_size/8);
+ }
+
+ // call shadow_mk_pagetable
+ __shadow_mk_pagetable( m );
+ return 0;
+
+nomem:
+ return -ENOMEM;
+}
+
+void shadow_mode_disable( struct domain *p )
+{
+ struct mm_struct *m = &p->mm;
+ struct shadow_status *next;
+
+ __free_shadow_table( m );
+ m->shadow_mode = 0;
+
+ SH_LOG("freed tables count=%d l1=%d l2=%d",
+ m->shadow_page_count, perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages));
+
+ next = m->shadow_ht_extras;
+ while( next )
+ {
+ struct shadow_status * this = next;
+ m->shadow_extras_count--;
+ next = *((struct shadow_status **)(&next[shadow_ht_extra_size]));
+ kfree( this );
+ }
+
+ SH_LOG("freed extras, now %d", m->shadow_extras_count);
+
+ if( m->shadow_dirty_bitmap )
+ {
+ kfree( m->shadow_dirty_bitmap );
+ m->shadow_dirty_bitmap = 0;
+ m->shadow_dirty_bitmap_size = 0;
+ }
+
+ // free the hashtable itself
+ kfree( &m->shadow_ht[0] );
+}
+
+static int shadow_mode_table_op(struct domain *d,
+ dom0_shadow_control_t *sc)
+{
+ unsigned int op = sc->op;
+ struct mm_struct *m = &d->mm;
+ int rc = 0;
+
+    // since Dom0 did the hypercall, we should be running with its page
+ // tables right now. Calling flush on yourself would be really
+ // stupid.
+
+ ASSERT(spin_is_locked(&d->mm.shadow_lock));
+
+    if ( m == &current->mm )
+ {
+ printk("Don't try and flush your own page tables!\n");
+ return -EINVAL;
+ }
+
+ SH_VLOG("shadow mode table op %08lx %08lx count %d",pagetable_val( m->pagetable),pagetable_val(m->shadow_table), m->shadow_page_count);
+
+ shadow_audit(m,1);
+
+ switch(op)
+ {
+ case DOM0_SHADOW_CONTROL_OP_FLUSH:
+ // XXX THIS IS VERY DANGEROUS : MUST ENSURE THE PTs ARE NOT IN USE ON
+ // OTHER CPU -- fix when we get sched sync pause.
+ __free_shadow_table( m );
+ break;
+
+    case DOM0_SHADOW_CONTROL_OP_CLEAN: // zero all non-hypervisor
+ {
+ __scan_shadow_table( m, TABLE_OP_ZERO_L2 );
+ __scan_shadow_table( m, TABLE_OP_ZERO_L1 );
+
+ goto send_bitmap;
+ }
+
+
+ case DOM0_SHADOW_CONTROL_OP_CLEAN2: // zero all L2, free L1s
+ {
+        int i,j,zero;
+
+ __scan_shadow_table( m, TABLE_OP_ZERO_L2 );
+ __scan_shadow_table( m, TABLE_OP_FREE_L1 );
+
+    send_bitmap:
+        // set here (not in the declaration) so entry via goto from the
+        // CLEAN case doesn't skip the initialisation
+        zero = 1;
+ sc->stats.fault_count = d->mm.shadow_fault_count;
+ sc->stats.dirty_count = d->mm.shadow_dirty_count;
+ sc->stats.dirty_net_count = d->mm.shadow_dirty_net_count;
+ sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
+
+ d->mm.shadow_fault_count = 0;
+ d->mm.shadow_dirty_count = 0;
+ d->mm.shadow_dirty_net_count = 0;
+ d->mm.shadow_dirty_block_count = 0;
+
+        if( d->tot_pages > sc->pages ||
+            !sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
+        {
+            rc = -EINVAL;
+            goto out;
+        }
+
+        sc->pages = d->tot_pages;
+
+
+#define chunk (8*1024) // do this in 1KB chunks for L1 cache
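+// (chunk counts pages: 8192 dirty bits == 1KB of bitmap per pass)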
+
+ for(i=0;i<d->tot_pages;i+=chunk)
+ {
+ int bytes = (( ((d->tot_pages-i) > (chunk))?
+ (chunk):(d->tot_pages-i) ) + 7) / 8;
+
+ copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
+ d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+ bytes );
+
+ for(j=0; zero && j<bytes/sizeof(unsigned long);j++)
+ {
+                if( d->mm.shadow_dirty_bitmap[i/(8*sizeof(unsigned long))+j] != 0 )
+ zero = 0;
+ }
+
+ memset( d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+ 0, bytes);
+ }
+
+ /* Might as well stop the domain as an optimization. */
+ if ( zero )
+ domain_pause_by_systemcontroller(d);
+
+ break;
+ }
+
+ case DOM0_SHADOW_CONTROL_OP_PEEK:
+ {
+ int i;
+
+ sc->stats.fault_count = d->mm.shadow_fault_count;
+ sc->stats.dirty_count = d->mm.shadow_dirty_count;
+ sc->stats.dirty_net_count = d->mm.shadow_dirty_net_count;
+ sc->stats.dirty_block_count = d->mm.shadow_dirty_block_count;
+
+ if( d->tot_pages > sc->pages ||
+ !sc->dirty_bitmap || !d->mm.shadow_dirty_bitmap )
+ {
+ rc = -EINVAL;
+ goto out;
+ }
+
+ sc->pages = d->tot_pages;
+
+#define chunk (8*1024) // do this in 1KB chunks for L1 cache
+
+ for(i=0;i<d->tot_pages;i+=chunk)
+ {
+ int bytes = (( ((d->tot_pages-i) > (chunk))?
+ (chunk):(d->tot_pages-i) ) + 7) / 8;
+
+ copy_to_user( sc->dirty_bitmap + (i/(8*sizeof(unsigned long))),
+ d->mm.shadow_dirty_bitmap +(i/(8*sizeof(unsigned long))),
+ bytes );
+ }
+
+ break;
+ }
+
+ default:
+ BUG();
+
+ }
+
+
+out:
+
+ SH_VLOG("shadow mode table op : page count %d", m->shadow_page_count);
+
+ shadow_audit(m,1);
+
+ // call shadow_mk_pagetable
+ __shadow_mk_pagetable( m );
+
+ return rc;
+}
+
+int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc )
+{
+ unsigned int cmd = sc->op;
+ int rc = 0;
+
+ spin_lock(&p->mm.shadow_lock);
+
+ if ( p->mm.shadow_mode && cmd == DOM0_SHADOW_CONTROL_OP_OFF )
+ {
+ shadow_mode_disable(p);
+ }
+ else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_TEST )
+ {
+ if(p->mm.shadow_mode) shadow_mode_disable(p);
+ shadow_mode_enable(p, SHM_test);
+ }
+ else if ( cmd == DOM0_SHADOW_CONTROL_OP_ENABLE_LOGDIRTY )
+ {
+ if(p->mm.shadow_mode) shadow_mode_disable(p);
+ shadow_mode_enable(p, SHM_logdirty);
+ }
+    else if ( p->mm.shadow_mode &&
+              cmd >= DOM0_SHADOW_CONTROL_OP_FLUSH &&
+              cmd <= DOM0_SHADOW_CONTROL_OP_CLEAN2 )
+ {
+ rc = shadow_mode_table_op(p, sc);
+ }
+ else
+ {
+ rc = -EINVAL;
+ }
+
+ flush_tlb_cpu(p->processor);
+
+ spin_unlock(&p->mm.shadow_lock);
+
+ return rc;
+}
+
+
+
+static inline struct pfn_info *alloc_shadow_page( struct mm_struct *m )
+{
+ m->shadow_page_count++;
+
+ return alloc_domain_page( NULL );
+}
+
+
+void unshadow_table( unsigned long gpfn, unsigned int type )
+{
+ unsigned long spfn;
+
+ SH_VLOG("unshadow_table type=%08x gpfn=%08lx",
+ type,
+ gpfn );
+
+ perfc_incrc(unshadow_table_count);
+
+ // this function is the same for both l1 and l2 tables
+
+ // even in the SMP guest case, there won't be a race here as
+ // this CPU was the one that cmpxchg'ed the page to invalid
+
+    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
+
+    delete_shadow_status(&current->mm, gpfn);
+
+    free_shadow_page( &current->mm, &frame_table[spfn] );
+
+}
+
+
+unsigned long shadow_l2_table(
+ struct mm_struct *m, unsigned long gpfn )
+{
+ struct pfn_info *spfn_info;
+ unsigned long spfn;
+ l2_pgentry_t *spl2e, *gpl2e;
+ int i;
+
+ SH_VVLOG("shadow_l2_table( %08lx )",gpfn);
+
+ perfc_incrc(shadow_l2_table_count);
+
+ // XXX in future, worry about racing in SMP guests
+ // -- use cmpxchg with PSH_pending flag to show progress (and spin)
+
+ spfn_info = alloc_shadow_page(m);
+
+ ASSERT( spfn_info ); // XXX deal with failure later e.g. blow cache
+
+ spfn_info->type_and_flags = PGT_l2_page_table;
+ perfc_incr(shadow_l2_pages);
+
+ spfn = (unsigned long) (spfn_info - frame_table);
+
+ // mark pfn as being shadowed, update field to point at shadow
+ set_shadow_status(m, gpfn, spfn | PSH_shadowed);
+
+ // we need to do this before the linear map is set up
+ spl2e = (l2_pgentry_t *) map_domain_mem(spfn << PAGE_SHIFT);
+
+#ifdef __i386__
+    // get hypervisor and 2x linear PT mappings installed
+ memcpy(&spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ &idle_pg_table[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ HYPERVISOR_ENTRIES_PER_L2_PAGETABLE * sizeof(l2_pgentry_t));
+ spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry((gpfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+ spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR);
+ spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT] =
+ mk_l2_pgentry(__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) |
+ __PAGE_HYPERVISOR);
+#endif
+
+ // can't use the linear map as we may not be in the right PT
+ gpl2e = (l2_pgentry_t *) map_domain_mem(gpfn << PAGE_SHIFT);
+
+ // proactively create entries for pages that are already shadowed
+ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+ {
+ unsigned long spte = 0;
+
+#if 0 // Turns out this doesn't really help
+ unsigned long gpte;
+
+ gpte = l2_pgentry_val(gpl2e[i]);
+
+ if (gpte & _PAGE_PRESENT)
+ {
+ unsigned long s_sh =
+                __shadow_status(m, gpte>>PAGE_SHIFT);
+
+ l2pde_general( m, &gpte, &spte, s_sh );
+
+ }
+#endif
+
+ spl2e[i] = mk_l2_pgentry( spte );
+
+ }
+
+    // it's arguable we should 'preemptively shadow' a few active L1 pages
+ // to avoid taking a string of faults when 'jacking' a running domain
+
+ unmap_domain_mem( gpl2e );
+ unmap_domain_mem( spl2e );
+
+ SH_VLOG("shadow_l2_table( %08lx -> %08lx)",gpfn,spfn);
+
+ return spfn;
+}
+
+
+int shadow_fault( unsigned long va, long error_code )
+{
+ unsigned long gpte, spte;
+    struct mm_struct *m = &current->mm;
+
+ SH_VVLOG("shadow_fault( va=%08lx, code=%ld )", va, error_code );
+
+ check_pagetable( current, current->mm.pagetable, "pre-sf" );
+
+ if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
+ {
+ SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
+ return 0; // propagate to guest
+ }
+
+ if ( ! (gpte & _PAGE_PRESENT) )
+ {
+ SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
+ return 0; // we're not going to be able to help
+ }
+
+ if ( (error_code & 2) && ! (gpte & _PAGE_RW) )
+ {
+ // write fault on RO page
+ return 0;
+ }
+
+ // take the lock and reread gpte
+
+    while( unlikely(!spin_trylock(&current->mm.shadow_lock)) )
+ {
+ extern volatile unsigned long flush_cpumask;
+ if ( test_and_clear_bit(smp_processor_id(), &flush_cpumask) )
+ local_flush_tlb();
+ rep_nop();
+ }
+
+    ASSERT(spin_is_locked(&current->mm.shadow_lock));
+
+ if ( unlikely(__get_user(gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
+ {
+ SH_VVLOG("shadow_fault - EXIT: read gpte faulted" );
+ spin_unlock(&m->shadow_lock);
+ return 0; // propagate to guest
+ }
+
+ if ( unlikely(!(gpte & _PAGE_PRESENT)) )
+ {
+ SH_VVLOG("shadow_fault - EXIT: gpte not present (%lx)",gpte );
+ spin_unlock(&m->shadow_lock);
+ return 0; // we're not going to be able to help
+ }
+
+ if ( error_code & 2 )
+ { // write fault
+ if ( likely(gpte & _PAGE_RW) )
+ {
+ l1pte_write_fault( m, &gpte, &spte );
+ }
+ else
+ { // write fault on RO page
+ SH_VVLOG("shadow_fault - EXIT: write fault on RO page (%lx)",gpte );
+ spin_unlock(&m->shadow_lock);
+ return 0; // propagate to guest
+ // not clear whether we should set accessed bit here...
+ }
+ }
+ else
+ {
+ l1pte_read_fault( m, &gpte, &spte );
+ }
+
+ SH_VVLOG("plan: gpte=%08lx spte=%08lx", gpte, spte );
+
+ // write back updated gpte
+ // XXX watch out for read-only L2 entries! (not used in Linux)
+ if ( unlikely( __put_user( gpte, (unsigned long*)&linear_pg_table[va>>PAGE_SHIFT])) )
+ BUG(); // fixme!
+
+ if ( unlikely( __put_user( spte, (unsigned long*)&shadow_linear_pg_table[va>>PAGE_SHIFT])) )
+ {
+ // failed:
+ // the L1 may not be shadowed, or the L2 entry may be insufficient
+
+ unsigned long gpde, spde, gl1pfn, sl1pfn;
+
+ SH_VVLOG("3: not shadowed or l2 insufficient gpte=%08lx spte=%08lx",gpte,spte );
+
+ gpde = l2_pgentry_val(linear_l2_table[va>>L2_PAGETABLE_SHIFT]);
+
+ gl1pfn = gpde>>PAGE_SHIFT;
+
+
+        if ( ! (sl1pfn=__shadow_status(&current->mm, gl1pfn) ) )
+ {
+ // this L1 is NOT already shadowed so we need to shadow it
+ struct pfn_info *sl1pfn_info;
+ unsigned long *gpl1e, *spl1e;
+ int i;
+            sl1pfn_info = alloc_shadow_page( &current->mm );
+ sl1pfn_info->type_and_flags = PGT_l1_page_table;
+
+ sl1pfn = sl1pfn_info - frame_table;
+
+ SH_VVLOG("4a: l1 not shadowed ( %08lx )",sl1pfn);
+ perfc_incrc(shadow_l1_table_count);
+ perfc_incr(shadow_l1_pages);
+
+            set_shadow_status(&current->mm, gl1pfn, PSH_shadowed | sl1pfn);
+
+ l2pde_general( m, &gpde, &spde, sl1pfn );
+
+ linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
+ shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde);
+
+ gpl1e = (unsigned long *) &(linear_pg_table[
+ (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ]);
+
+ spl1e = (unsigned long *) &shadow_linear_pg_table[
+ (va>>PAGE_SHIFT) & ~(ENTRIES_PER_L1_PAGETABLE-1) ];
+
+
+ for ( i = 0; i < ENTRIES_PER_L1_PAGETABLE; i++ )
+ {
+ l1pte_no_fault( m, &gpl1e[i], &spl1e[i] );
+ }
+
+
+ }
+ else
+ {
+ // this L1 was shadowed (by another PT) but we didn't have an L2
+ // entry for it
+
+ SH_VVLOG("4b: was shadowed, l2 missing ( %08lx )",sl1pfn);
+
+ l2pde_general( m, &gpde, &spde, sl1pfn );
+
+ linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(gpde);
+ shadow_linear_l2_table[va>>L2_PAGETABLE_SHIFT] = mk_l2_pgentry(spde);
+
+ }
+
+ shadow_linear_pg_table[va>>PAGE_SHIFT] = mk_l1_pgentry(spte);
+ // (we need to do the above even if we've just made the shadow L1)
+
+ } // end of fixup writing the shadow L1 directly failed
+
+ perfc_incrc(shadow_fixup_count);
+
+ m->shadow_fault_count++;
+
+ check_pagetable( current, current->mm.pagetable, "post-sf" );
+
+ spin_unlock(&m->shadow_lock);
+
+ return 1; // let's try the faulting instruction again...
+
+}
+
+
+void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte,
+ unsigned long *prev_spfn_ptr,
+ l1_pgentry_t **prev_spl1e_ptr )
+{
+ unsigned long gpfn, spfn, spte, prev_spfn = *prev_spfn_ptr;
+ l1_pgentry_t * spl1e, * prev_spl1e = *prev_spl1e_ptr;
+
+
+ SH_VVLOG("shadow_l1_normal_pt_update pa=%08lx, gpte=%08lx, prev_spfn=%08lx, prev_spl1e=%p\n",
+ pa,gpte,prev_spfn, prev_spl1e);
+
+ // to get here, we know the l1 page *must* be shadowed
+
+ gpfn = pa >> PAGE_SHIFT;
+    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
+
+ if ( spfn == prev_spfn )
+ {
+ spl1e = prev_spl1e;
+ }
+ else
+ {
+ if( prev_spl1e ) unmap_domain_mem( prev_spl1e );
+ spl1e = (l1_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
+ *prev_spfn_ptr = spfn;
+ *prev_spl1e_ptr = spl1e;
+ }
+
+ // XXX we assume only pagetables can be shadowed;
+ // this will have to change to allow arbitrary CoW etc.
+
+    l1pte_no_fault( &current->mm, &gpte, &spte );
+
+
+ spl1e[(pa & ~PAGE_MASK) / sizeof(l1_pgentry_t) ] = mk_l1_pgentry( spte );
+
+}
+
+void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte )
+{
+ unsigned long gpfn, spfn, spte;
+ l2_pgentry_t * sp2le;
+ unsigned long s_sh=0;
+
+ SH_VVLOG("shadow_l2_normal_pt_update pa=%08lx, gpte=%08lx",pa,gpte);
+
+ // to get here, we know the l2 page has a shadow
+
+ gpfn = pa >> PAGE_SHIFT;
+    spfn = __shadow_status(&current->mm, gpfn) & PSH_pfn_mask;
+
+
+ spte = 0;
+
+ if( gpte & _PAGE_PRESENT )
+        s_sh = __shadow_status(&current->mm, gpte >> PAGE_SHIFT);
+
+ sp2le = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
+ // no real need for a cache here
+
+    l2pde_general( &current->mm, &gpte, &spte, s_sh );
+
+ // XXXX Should mark guest pte as DIRTY and ACCESSED too!!!!!
+
+ sp2le[(pa & ~PAGE_MASK) / sizeof(l2_pgentry_t) ] =
+ mk_l2_pgentry( spte );
+
+ unmap_domain_mem( (void *) sp2le );
+}
+
+
+#if SHADOW_DEBUG
+
+static int sh_l2_present;
+static int sh_l1_present;
+char * sh_check_name;
+
+#define FAIL(_f, _a...) \
+{printk("XXX %s-FAIL (%d,%d)" _f " g=%08lx s=%08lx\n", sh_check_name, level, i, ## _a , gpte, spte ); BUG();}
+
+static int check_pte( struct mm_struct *m,
+ unsigned long gpte, unsigned long spte, int level, int i )
+{
+ unsigned long mask, gpfn, spfn;
+
+ if ( spte == 0 || spte == 0xdeadface || spte == 0x00000E00)
+ return 1; // always safe
+
+ if ( !(spte & _PAGE_PRESENT) )
+ FAIL("Non zero not present spte");
+
+ if( level == 2 ) sh_l2_present++;
+ if( level == 1 ) sh_l1_present++;
+
+ if ( !(gpte & _PAGE_PRESENT) )
+ FAIL("Guest not present yet shadow is");
+
+ mask = ~(_PAGE_DIRTY|_PAGE_ACCESSED|_PAGE_RW|0xFFFFF000);
+
+ if ( (spte & mask) != (gpte & mask ) )
+ FAIL("Corrupt?");
+
+ if ( (spte & _PAGE_DIRTY ) && !(gpte & _PAGE_DIRTY) )
+ FAIL("Dirty coherence");
+
+ if ( (spte & _PAGE_ACCESSED ) && !(gpte & _PAGE_ACCESSED) )
+ FAIL("Accessed coherence");
+
+ if ( (spte & _PAGE_RW ) && !(gpte & _PAGE_RW) )
+ FAIL("RW coherence");
+
+ if ( (spte & _PAGE_RW ) && !((gpte & _PAGE_RW) && (gpte & _PAGE_DIRTY) ))
+ FAIL("RW2 coherence");
+
+ spfn = spte>>PAGE_SHIFT;
+ gpfn = gpte>>PAGE_SHIFT;
+
+ if ( gpfn == spfn )
+ {
+ if ( level > 1 )
+ FAIL("Linear map ???"); // XXX this will fail on BSD
+
+ return 1;
+ }
+ else
+ {
+ if ( level < 2 )
+ FAIL("Shadow in L1 entry?");
+
+        if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) )
+            FAIL("spfn problem g.sf=%08lx",
+                 __shadow_status(m, gpfn) );
+ }
+
+ return 1;
+}
+
+
+static int check_l1_table( struct mm_struct *m, unsigned long va,
+ unsigned long g2, unsigned long s2 )
+{
+ int j;
+ unsigned long *gpl1e, *spl1e;
+
+ //gpl1e = (unsigned long *) &(linear_pg_table[ va>>PAGE_SHIFT]);
+ //spl1e = (unsigned long *) &(shadow_linear_pg_table[ va>>PAGE_SHIFT]);
+
+ gpl1e = map_domain_mem( g2<<PAGE_SHIFT );
+ spl1e = map_domain_mem( s2<<PAGE_SHIFT );
+
+ for ( j = 0; j < ENTRIES_PER_L1_PAGETABLE; j++ )
+ {
+ unsigned long gpte = gpl1e[j];
+ unsigned long spte = spl1e[j];
+
+        check_pte( m, gpte, spte, 1, j );
+ }
+
+ unmap_domain_mem( spl1e );
+ unmap_domain_mem( gpl1e );
+
+ return 1;
+}
+
+#define FAILPT(_f, _a...) \
+{printk("XXX FAIL %s-PT" _f "\n", s, ## _a ); BUG();}
+
+int check_pagetable( struct mm_struct *m, pagetable_t pt, char *s )
+{
+ unsigned long gptbase = pagetable_val(pt);
+ unsigned long gpfn, spfn;
+ int i;
+ l2_pgentry_t *gpl2e, *spl2e;
+
+ sh_check_name = s;
+
+ SH_VVLOG("%s-PT Audit",s);
+
+ sh_l2_present = sh_l1_present = 0;
+
+ gpfn = gptbase >> PAGE_SHIFT;
+
+    if ( ! (__shadow_status(m, gpfn) & PSH_shadowed) )
+ {
+ printk("%s-PT %08lx not shadowed\n", s, gptbase);
+
+        if( __shadow_status(m, gpfn) != 0 ) BUG();
+
+ return 0;
+ }
+
+    spfn = __shadow_status(m, gpfn) & PSH_pfn_mask;
+
+    if ( __shadow_status(m, gpfn) != (PSH_shadowed | spfn) )
+        FAILPT("ptbase shadow inconsistent1");
+
+ gpl2e = (l2_pgentry_t *) map_domain_mem( gpfn << PAGE_SHIFT );
+ spl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
+
+ //ipl2e = (l2_pgentry_t *) map_domain_mem( spfn << PAGE_SHIFT );
+
+
+ if ( memcmp( &spl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ &gpl2e[DOMAIN_ENTRIES_PER_L2_PAGETABLE],
+ ((SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT))-DOMAIN_ENTRIES_PER_L2_PAGETABLE)
+ * sizeof(l2_pgentry_t)) )
+ {
+ printk("gpfn=%08lx spfn=%08lx\n", gpfn, spfn);
+ for (i=DOMAIN_ENTRIES_PER_L2_PAGETABLE;
+ i<(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT));
+ i++ )
+ printk("+++ (%d) %08lx %08lx\n",i,
+ l2_pgentry_val(gpl2e[i]), l2_pgentry_val(spl2e[i]) );
+ FAILPT("hypervisor entries inconsistent");
+ }
+
+ if ( (l2_pgentry_val(spl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
+ l2_pgentry_val(gpl2e[LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT])) )
+ FAILPT("hypervisor linear map inconsistent");
+
+ if ( (l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
+ ((spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR)) )
+ FAILPT("hypervisor shadow linear map inconsistent %08lx %08lx",
+ l2_pgentry_val(spl2e[SH_LINEAR_PT_VIRT_START >> L2_PAGETABLE_SHIFT]),
+ (spfn << PAGE_SHIFT) | __PAGE_HYPERVISOR
+ );
+
+ if ( (l2_pgentry_val(spl2e[PERDOMAIN_VIRT_START >> L2_PAGETABLE_SHIFT]) !=
+ ((__pa(frame_table[gpfn].u.domain->mm.perdomain_pt) | __PAGE_HYPERVISOR))) )
+ FAILPT("hypervisor per-domain map inconsistent");
+
+
+ // check the whole L2
+ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+ {
+ unsigned long gpte = l2_pgentry_val(gpl2e[i]);
+ unsigned long spte = l2_pgentry_val(spl2e[i]);
+
+        check_pte( m, gpte, spte, 2, i );
+ }
+
+
+ // go back and recurse
+ for ( i = 0; i < DOMAIN_ENTRIES_PER_L2_PAGETABLE; i++ )
+ {
+ unsigned long gpte = l2_pgentry_val(gpl2e[i]);
+ unsigned long spte = l2_pgentry_val(spl2e[i]);
+
+ if ( spte )
+            check_l1_table( m,
+ i<<L2_PAGETABLE_SHIFT,
+ gpte>>PAGE_SHIFT, spte>>PAGE_SHIFT );
+
+ }
+
+ unmap_domain_mem( spl2e );
+ unmap_domain_mem( gpl2e );
+
+ SH_VVLOG("PT verified : l2_present = %d, l1_present = %d\n",
+ sh_l2_present, sh_l1_present );
+
+ return 1;
+}
+
+
+#endif
--- /dev/null
+/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- */
+
+#ifndef _XEN_SHADOW_H
+#define _XEN_SHADOW_H
+
+#include <xen/config.h>
+#include <xen/types.h>
+#include <xen/perfc.h>
+#include <asm/processor.h>
+
+
+/* Shadow PT flag bits in pfn_info */
+#define PSH_shadowed (1<<31) /* page has a shadow. PFN points to shadow */
+#define PSH_pending (1<<29) /* page is in the process of being shadowed */
+#define PSH_pfn_mask ((1<<21)-1)
+
+/* Shadow PT operation mode : shadowmode variable in mm_struct */
+#define SHM_test (1) /* just run domain on shadow PTs */
+#define SHM_logdirty (2) /* log pages that are dirtied */
+#define SHM_translate (3) /* lookup machine pages in translation table */
+//#define SHM_cow (4) /* copy on write all dirtied pages */
+
+
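+/* The shadow L2 installs a self-referencing entry at
+   SH_LINEAR_PT_VIRT_START (see shadow_l2_table), so while running on the
+   shadow tables the shadow L1 entries appear as a flat array at that
+   address; the shadow L2 entries appear within the same window at the
+   offset computed below. */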
+#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
+#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START + \
+    (SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
+
+extern void shadow_mode_init(void);
+extern int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc );
+extern int shadow_fault( unsigned long va, long error_code );
+extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte,
+ unsigned long *prev_spfn_ptr,
+ l1_pgentry_t **prev_spl1e_ptr );
+extern void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte );
+extern void unshadow_table( unsigned long gpfn, unsigned int type );
+extern int shadow_mode_enable( struct domain *p, unsigned int mode );
+extern void shadow_mode_disable( struct domain *p );
+extern unsigned long shadow_l2_table(
+ struct mm_struct *m, unsigned long gpfn );
+
+#define SHADOW_DEBUG 0
+#define SHADOW_HASH_DEBUG 0
+#define SHADOW_OPTIMISE 1
+
+struct shadow_status {
+    unsigned long pfn;            // guest pfn
+    unsigned long spfn_and_flags; // shadow pfn plus PSH_* flags
+    struct shadow_status *next;   // pull-to-front list within each bucket
+};
+
+#define shadow_ht_extra_size 128
+#define shadow_ht_buckets    256
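+
+/* Each overflow allocation holds shadow_ht_extra_size entries followed by
+   one pointer that chains the allocations together (see set_shadow_status()
+   below). A sketch of how such a chain could be walked and freed on
+   teardown -- illustrative only, and it assumes kfree() matches the
+   kmalloc() used below; the real cleanup lives in shadow.c: */
+#if 0
+static inline void free_shadow_ht_extras( struct mm_struct *m )
+{
+    void *x, *n;
+    for ( x = m->shadow_ht_extras; x != NULL; x = n )
+    {
+        /* the chain pointer is stored just after the array of entries */
+        n = *((void **)((struct shadow_status *)x + shadow_ht_extra_size));
+        kfree(x);
+    }
+}
+#endif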
+
+#ifndef NDEBUG
+#define SH_LOG(_f, _a...) \
+printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
+ current->domain , __LINE__ , ## _a )
+#else
+#define SH_LOG(_f, _a...)
+#endif
+
+#if SHADOW_DEBUG
+#define SH_VLOG(_f, _a...) \
+ printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
+ current->domain , __LINE__ , ## _a )
+#else
+#define SH_VLOG(_f, _a...)
+#endif
+
+#if 0
+#define SH_VVLOG(_f, _a...) \
+ printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
+ current->domain , __LINE__ , ## _a )
+#else
+#define SH_VVLOG(_f, _a...)
+#endif
+
+
+/************************************************************************/
+
+#define shadow_mode(d) (d->mm.shadow_mode)
+#define shadow_lock_init(d) spin_lock_init(&d->mm.shadow_lock)
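+
+/* Typical set-up/test pattern -- a sketch only, assuming a domain pointer
+   d as used elsewhere in Xen:
+
+       shadow_lock_init(d);
+       if ( shadow_mode(d) == SHM_logdirty )
+           ... consult d->mm.shadow_dirty_bitmap ...
+*/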
+
+/************************************************************************/
+
+static inline int __mark_dirty( struct mm_struct *m, unsigned int mfn )
+{
+ unsigned int pfn;
+ int rc = 0;
+
+ ASSERT(spin_is_locked(&m->shadow_lock));
+
+ pfn = machine_to_phys_mapping[mfn];
+
+    /* We use values with the top bit set to mark MFNs that aren't
+       really part of the domain's pseudo-physical memory map, e.g.
+       the shared info frame. Nothing to do here...
+    */
+ if ( unlikely(pfn & 0x80000000U) ) return rc;
+
+ ASSERT(m->shadow_dirty_bitmap);
+ if( likely(pfn<m->shadow_dirty_bitmap_size) )
+ {
+ /* These updates occur with mm.shadow_lock held, so use
+ (__) version of test_and_set */
+ if( __test_and_set_bit( pfn, m->shadow_dirty_bitmap ) == 0 )
+ {
+ // if we set it
+ m->shadow_dirty_count++;
+ rc = 1;
+ }
+ }
+ else
+ {
+ extern void show_traceX(void);
+ SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)",
+ mfn, pfn, m->shadow_dirty_bitmap_size, m );
+ SH_LOG("dom=%u caf=%08x taf=%08x\n",
+ frame_table[mfn].u.domain->domain,
+ frame_table[mfn].count_and_flags,
+ frame_table[mfn].type_and_flags );
+ }
+
+ return rc;
+}
+
+
+static inline int mark_dirty( struct mm_struct *m, unsigned int mfn )
+{
+ int rc;
+ ASSERT(local_irq_is_enabled());
+ //if(spin_is_locked(&m->shadow_lock)) printk("+");
+ spin_lock(&m->shadow_lock);
+ rc = __mark_dirty( m, mfn );
+ spin_unlock(&m->shadow_lock);
+ return rc;
+}
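+
+/* A hedged usage sketch: a hypercall path that has just written machine
+   frame 'mfn' on behalf of a log-dirty guest might record the write as
+   follows (hypothetical caller, not taken from shadow.c): */
+#if 0
+static inline void note_guest_write( struct domain *d, unsigned int mfn )
+{
+    if ( d->mm.shadow_mode == SHM_logdirty )
+        mark_dirty( &d->mm, mfn );  /* takes and drops the shadow lock */
+}
+#endif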
+
+
+/************************************************************************/
+
+static inline void l1pte_write_fault( struct mm_struct *m,
+ unsigned long *gpte_p, unsigned long *spte_p )
+{
+ unsigned long gpte = *gpte_p;
+ unsigned long spte = *spte_p;
+
+ switch( m->shadow_mode )
+ {
+ case SHM_test:
+ spte = gpte;
+ gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
+ spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
+ break;
+
+ case SHM_logdirty:
+ spte = gpte;
+ gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
+ spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
+ __mark_dirty( m, (gpte >> PAGE_SHIFT) );
+ break;
+ }
+
+ *gpte_p = gpte;
+ *spte_p = spte;
+}
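+
+/* The l1pte_* helpers are pure propagation rules: the fault handler reads
+   the guest pte, derives the shadow pte, and writes both back. A minimal
+   sketch of the write-fault case (the real sequence, with its locking and
+   error handling, is shadow_fault() in shadow.c): */
+#if 0
+static inline void example_write_fault( struct mm_struct *m, unsigned long va )
+{
+    unsigned long gpte = l1_pgentry_val(linear_pg_table[va >> PAGE_SHIFT]);
+    unsigned long spte = 0;
+    l1pte_write_fault( m, &gpte, &spte );
+    linear_pg_table[va >> PAGE_SHIFT]        = mk_l1_pgentry(gpte);
+    shadow_linear_pg_table[va >> PAGE_SHIFT] = mk_l1_pgentry(spte);
+}
+#endif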
+
+static inline void l1pte_read_fault( struct mm_struct *m,
+ unsigned long *gpte_p, unsigned long *spte_p )
+{
+ unsigned long gpte = *gpte_p;
+ unsigned long spte = *spte_p;
+
+ switch( m->shadow_mode )
+ {
+ case SHM_test:
+ spte = gpte;
+ gpte |= _PAGE_ACCESSED;
+ spte |= _PAGE_ACCESSED;
+ if ( ! (gpte & _PAGE_DIRTY ) )
+ spte &= ~ _PAGE_RW;
+ break;
+
+ case SHM_logdirty:
+ spte = gpte;
+ gpte |= _PAGE_ACCESSED;
+ spte |= _PAGE_ACCESSED;
+ spte &= ~ _PAGE_RW;
+ break;
+ }
+
+ *gpte_p = gpte;
+ *spte_p = spte;
+}
+
+static inline void l1pte_no_fault( struct mm_struct *m,
+ unsigned long *gpte_p, unsigned long *spte_p )
+{
+ unsigned long gpte = *gpte_p;
+ unsigned long spte = *spte_p;
+
+ switch( m->shadow_mode )
+ {
+ case SHM_test:
+ spte = 0;
+ if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) ==
+ (_PAGE_PRESENT|_PAGE_ACCESSED) )
+ {
+ spte = gpte;
+ if ( ! (gpte & _PAGE_DIRTY ) )
+ spte &= ~ _PAGE_RW;
+ }
+ break;
+
+ case SHM_logdirty:
+ spte = 0;
+ if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) ==
+ (_PAGE_PRESENT|_PAGE_ACCESSED) )
+ {
+ spte = gpte;
+ spte &= ~ _PAGE_RW;
+ }
+
+ break;
+ }
+
+ *gpte_p = gpte;
+ *spte_p = spte;
+}
+
+static inline void l2pde_general( struct mm_struct *m,
+ unsigned long *gpde_p, unsigned long *spde_p,
+ unsigned long sl1pfn)
+{
+ unsigned long gpde = *gpde_p;
+ unsigned long spde = *spde_p;
+
+ spde = 0;
+
+ if ( sl1pfn )
+ {
+ spde = (gpde & ~PAGE_MASK) | (sl1pfn<<PAGE_SHIFT) |
+ _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
+ gpde = gpde | _PAGE_ACCESSED | _PAGE_DIRTY;
+
+ if ( unlikely( (sl1pfn<<PAGE_SHIFT) == (gpde & PAGE_MASK) ) )
+ {
+ // detect linear map, and keep pointing at guest
+ SH_VLOG("4c: linear mapping ( %08lx )",sl1pfn);
+ spde = gpde & ~_PAGE_RW;
+ }
+ }
+
+ *gpde_p = gpde;
+ *spde_p = spde;
+}
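+
+/* l2pde_general() builds a shadow L2 entry from a guest one, given the pfn
+   of the shadowed L1 (or 0 if no shadow exists yet). A sketch of the usual
+   call pattern, assuming the caller already holds the shadow lock and has
+   both L2 tables mapped: */
+#if 0
+static inline void example_propagate_l2e( struct mm_struct *m,
+                                          l2_pgentry_t *gpl2e,
+                                          l2_pgentry_t *spl2e, int i )
+{
+    unsigned long gpde = l2_pgentry_val(gpl2e[i]);
+    unsigned long spde = 0;
+    unsigned long ss   = __shadow_status( m, gpde >> PAGE_SHIFT );
+    l2pde_general( m, &gpde, &spde, ss & PSH_pfn_mask );
+    gpl2e[i] = mk_l2_pgentry(gpde);
+    spl2e[i] = mk_l2_pgentry(spde);
+}
+#endif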
+
+/*********************************************************************/
+
+
+
+#if SHADOW_HASH_DEBUG
+static void shadow_audit(struct mm_struct *m, int print)
+{
+ int live=0, free=0, j=0, abs;
+ struct shadow_status *a;
+
+ for( j = 0; j < shadow_ht_buckets; j++ )
+ {
+ a = &m->shadow_ht[j];
+ if(a->pfn){live++; ASSERT(a->spfn_and_flags&PSH_pfn_mask);}
+ ASSERT((a->pfn&0xf0000000)==0);
+ ASSERT(a->pfn<0x00100000);
+ a=a->next;
+ while(a && live<9999)
+ {
+ live++;
+ if(a->pfn == 0 || a->spfn_and_flags == 0)
+ {
+ printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n",
+ live, a->pfn, a->spfn_and_flags, a->next);
+ BUG();
+ }
+ ASSERT(a->pfn);
+ ASSERT((a->pfn&0xf0000000)==0);
+ ASSERT(a->pfn<0x00100000);
+ ASSERT(a->spfn_and_flags&PSH_pfn_mask);
+ a=a->next;
+ }
+ ASSERT(live<9999);
+ }
+
+ a = m->shadow_ht_free;
+ while(a) { free++; a=a->next; }
+
+ if(print) printk("Xlive=%d free=%d\n",live,free);
+
+ abs=(perfc_value(shadow_l1_pages)+perfc_value(shadow_l2_pages))-live;
+ if( abs < -1 || abs > 1 )
+ {
+ printk("live=%d free=%d l1=%d l2=%d\n",live,free,
+ perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages) );
+ BUG();
+ }
+
+}
+
+#else
+#define shadow_audit(p, print)
+#endif
+
+
+
+static inline struct shadow_status* hash_bucket( struct mm_struct *m,
+ unsigned int gpfn )
+{
+ return &(m->shadow_ht[gpfn % shadow_ht_buckets]);
+}
+
+
+static inline unsigned long __shadow_status( struct mm_struct *m,
+ unsigned int gpfn )
+{
+ struct shadow_status **ob, *b, *B = hash_bucket( m, gpfn );
+
+ b = B;
+ ob = NULL;
+
+ SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, b );
+    shadow_audit(m,0);  // no-op unless SHADOW_HASH_DEBUG
+
+ do
+ {
+ if ( b->pfn == gpfn )
+ {
+ unsigned long t;
+ struct shadow_status *x;
+
+ // swap with head
+ t=B->pfn; B->pfn=b->pfn; b->pfn=t;
+ t=B->spfn_and_flags; B->spfn_and_flags=b->spfn_and_flags;
+ b->spfn_and_flags=t;
+
+ if( ob )
+ { // pull to front
+ *ob=b->next;
+ x=B->next;
+ B->next=b;
+ b->next=x;
+ }
+ return B->spfn_and_flags;
+ }
+#if SHADOW_HASH_DEBUG
+ else
+ {
+ if(b!=B)ASSERT(b->pfn);
+ }
+#endif
+ ob=&b->next;
+ b=b->next;
+ }
+ while (b);
+
+ return 0;
+}
+
+/* We could make this locking more fine-grained (e.g. per shadow page) if it
+   ever becomes a problem, but since we need a spinlock on the hash table
+   anyway it's probably not worth being too clever. */
+
+static inline unsigned long get_shadow_status( struct mm_struct *m,
+ unsigned int gpfn )
+{
+ unsigned long res;
+
+    /* If we get here, we know that this domain is running in shadow mode.
+       We also know that some sort of update has happened to the underlying
+       page-table page: either a PTE has been updated, or the page has
+       changed type. If we're in log-dirty mode, we should set the
+       appropriate bit in the dirty bitmap.
+       NB: the VA update path doesn't use this function, so it must be
+       handled independently.
+    */
+
+ ASSERT(local_irq_is_enabled());
+ //if(spin_is_locked(&m->shadow_lock)) printk("*");
+ spin_lock(&m->shadow_lock);
+
+ if( m->shadow_mode == SHM_logdirty )
+ __mark_dirty( m, gpfn );
+
+ res = __shadow_status( m, gpfn );
+ if (!res) spin_unlock(&m->shadow_lock);
+ return res;
+}
+
+
+static inline void put_shadow_status( struct mm_struct *m )
+{
+ spin_unlock(&m->shadow_lock);
+}
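+
+/* Note the asymmetry: get_shadow_status() returns with the shadow lock held
+   only when it finds an entry, so put_shadow_status() belongs on the
+   success path alone. A sketch of the intended pairing: */
+#if 0
+static inline void example_use_status( struct mm_struct *m, unsigned int gpfn )
+{
+    unsigned long ss;
+    if ( (ss = get_shadow_status( m, gpfn )) != 0 )
+    {
+        /* ... update the shadow page at (ss & PSH_pfn_mask) ... */
+        put_shadow_status( m );  /* drops the lock taken on success */
+    }
+    /* on failure get_shadow_status() has already dropped the lock */
+}
+#endif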
+
+
+static inline void delete_shadow_status( struct mm_struct *m,
+ unsigned int gpfn )
+{
+ struct shadow_status *b, *B, **ob;
+
+ ASSERT(spin_is_locked(&m->shadow_lock));
+
+ B = b = hash_bucket( m, gpfn );
+
+ SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b );
+ shadow_audit(m,0);
+ ASSERT(gpfn);
+
+ if( b->pfn == gpfn )
+ {
+ if (b->next)
+ {
+ struct shadow_status *D=b->next;
+ b->spfn_and_flags = b->next->spfn_and_flags;
+ b->pfn = b->next->pfn;
+
+ b->next = b->next->next;
+ D->next = m->shadow_ht_free;
+ D->pfn = 0;
+ D->spfn_and_flags = 0;
+ m->shadow_ht_free = D;
+ }
+ else
+ {
+ b->pfn = 0;
+ b->spfn_and_flags = 0;
+ }
+
+#if SHADOW_HASH_DEBUG
+ if( __shadow_status(m,gpfn) ) BUG();
+ shadow_audit(m,0);
+#endif
+ return;
+ }
+
+ ob = &b->next;
+ b=b->next;
+
+ do
+ {
+ if ( b->pfn == gpfn )
+ {
+ b->pfn = 0;
+ b->spfn_and_flags = 0;
+
+ // b is in the list
+ *ob=b->next;
+ b->next = m->shadow_ht_free;
+ m->shadow_ht_free = b;
+
+#if SHADOW_HASH_DEBUG
+ if( __shadow_status(m,gpfn) ) BUG();
+#endif
+ shadow_audit(m,0);
+ return;
+ }
+
+ ob = &b->next;
+ b=b->next;
+ }
+ while (b);
+
+ // if we got here, it wasn't in the list
+ BUG();
+}
+
+
+static inline void set_shadow_status( struct mm_struct *m,
+ unsigned int gpfn, unsigned long s )
+{
+ struct shadow_status *b, *B, *extra, **fptr;
+ int i;
+
+ ASSERT(spin_is_locked(&m->shadow_lock));
+
+ B = b = hash_bucket( m, gpfn );
+
+ ASSERT(gpfn);
+ SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next );
+
+ shadow_audit(m,0);
+
+ do
+ {
+ if ( b->pfn == gpfn )
+ {
+ b->spfn_and_flags = s;
+ shadow_audit(m,0);
+ return;
+ }
+
+ b=b->next;
+ }
+ while (b);
+
+    // if we got here, this is an insert rather than an update
+
+ ASSERT( s ); // deletes must have succeeded by here
+
+ if ( B->pfn == 0 )
+ {
+ // we can use this head
+ ASSERT( B->next == 0 );
+ B->pfn = gpfn;
+ B->spfn_and_flags = s;
+ shadow_audit(m,0);
+ return;
+ }
+
+ if( unlikely(m->shadow_ht_free == NULL) )
+ {
+ SH_LOG("allocate more shadow hashtable blocks");
+
+ // we need to allocate more space
+ extra = kmalloc(sizeof(void*) + (shadow_ht_extra_size *
+ sizeof(struct shadow_status)));
+
+ if( ! extra ) BUG(); // should be more graceful here....
+
+ memset(extra, 0, sizeof(void*) + (shadow_ht_extra_size *
+ sizeof(struct shadow_status)));
+
+ m->shadow_extras_count++;
+
+ // add extras to free list
+ fptr = &m->shadow_ht_free;
+ for ( i=0; i<shadow_ht_extra_size; i++ )
+ {
+ *fptr = &extra[i];
+ fptr = &(extra[i].next);
+ }
+ *fptr = NULL;
+
+ *((struct shadow_status ** ) &extra[shadow_ht_extra_size]) =
+ m->shadow_ht_extras;
+ m->shadow_ht_extras = extra;
+
+ }
+
+    // ideally we'd reuse B itself so the new entry sits right at the front
+ b = m->shadow_ht_free;
+ m->shadow_ht_free = b->next;
+ b->spfn_and_flags = s;
+ b->pfn = gpfn;
+ b->next = B->next;
+ B->next = b;
+
+ shadow_audit(m,0);
+
+ return;
+}
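+
+/* set_shadow_status() and delete_shadow_status() both ASSERT that the
+   caller already holds the shadow lock. A sketch of installing a new
+   shadow mapping, assuming spfn was just allocated by the caller: */
+#if 0
+static inline void example_install_shadow( struct mm_struct *m,
+                                           unsigned int gpfn,
+                                           unsigned long spfn )
+{
+    spin_lock(&m->shadow_lock);
+    set_shadow_status( m, gpfn, spfn | PSH_shadowed );
+    spin_unlock(&m->shadow_lock);
+}
+#endif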
+
+static inline void __shadow_mk_pagetable( struct mm_struct *mm )
+{
+ unsigned long gpfn, spfn=0;
+
+ gpfn = pagetable_val(mm->pagetable) >> PAGE_SHIFT;
+
+ if ( unlikely((spfn=__shadow_status(mm, gpfn)) == 0 ) )
+ {
+ spfn = shadow_l2_table(mm, gpfn );
+ }
+ mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
+}
+
+static inline void shadow_mk_pagetable( struct mm_struct *mm )
+{
+ SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
+ pagetable_val(mm->pagetable), mm->shadow_mode );
+
+ if ( unlikely(mm->shadow_mode) )
+ {
+ ASSERT(local_irq_is_enabled());
+ spin_lock(&mm->shadow_lock);
+
+ __shadow_mk_pagetable( mm );
+
+ spin_unlock(&mm->shadow_lock);
+ }
+
+ SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx",
+ pagetable_val(mm->pagetable), mm->shadow_mode,
+ pagetable_val(mm->shadow_table) );
+
+}
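+
+/* shadow_mk_pagetable() is intended for the point where a domain's
+   page-table base changes: it looks up (or builds) the shadow of the new
+   base so the context-switch path can load mm->shadow_table rather than
+   mm->pagetable when shadow mode is on. A sketch of that decision --
+   illustrative only, the real code lives in the context-switch path: */
+#if 0
+static inline void example_load_ptbase( struct domain *d )
+{
+    unsigned long pa;
+    shadow_mk_pagetable( &d->mm );
+    pa = d->mm.shadow_mode ? pagetable_val(d->mm.shadow_table)
+                           : pagetable_val(d->mm.pagetable);
+    __asm__ __volatile__ ( "movl %0,%%cr3" : : "r" (pa) : "memory" );
+}
+#endif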
+
+
+#if SHADOW_DEBUG
+extern int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s);
+#else
+#define check_pagetable(m, pt, s) ((void)0)
+#endif
+
+
+#endif /* _XEN_SHADOW_H */
+
+
-/* -*- Mode:C; c-basic-offset:4; tab-width:4 -*- */
-
-#ifndef _XEN_SHADOW_H
-#define _XEN_SHADOW_H
-
-#include <xen/config.h>
-#include <xen/types.h>
-#include <xen/perfc.h>
-#include <asm/processor.h>
-
-
-/* Shadow PT flag bits in pfn_info */
-#define PSH_shadowed (1<<31) /* page has a shadow. PFN points to shadow */
-#define PSH_pending (1<<29) /* page is in the process of being shadowed */
-#define PSH_pfn_mask ((1<<21)-1)
-
-/* Shadow PT operation mode : shadowmode variable in mm_struct */
-#define SHM_test (1) /* just run domain on shadow PTs */
-#define SHM_logdirty (2) /* log pages that are dirtied */
-#define SHM_translate (3) /* lookup machine pages in translation table */
-//#define SHM_cow (4) /* copy on write all dirtied pages */
-
-
-#define shadow_linear_pg_table ((l1_pgentry_t *)SH_LINEAR_PT_VIRT_START)
-#define shadow_linear_l2_table ((l2_pgentry_t *)(SH_LINEAR_PT_VIRT_START+(SH_LINEAR_PT_VIRT_START>>(L2_PAGETABLE_SHIFT-L1_PAGETABLE_SHIFT))))
-
-extern void shadow_mode_init(void);
-extern int shadow_mode_control( struct domain *p, dom0_shadow_control_t *sc );
-extern int shadow_fault( unsigned long va, long error_code );
-extern void shadow_l1_normal_pt_update( unsigned long pa, unsigned long gpte,
- unsigned long *prev_spfn_ptr,
- l1_pgentry_t **prev_spl1e_ptr );
-extern void shadow_l2_normal_pt_update( unsigned long pa, unsigned long gpte );
-extern void unshadow_table( unsigned long gpfn, unsigned int type );
-extern int shadow_mode_enable( struct domain *p, unsigned int mode );
-extern void shadow_mode_disable( struct domain *p );
-extern unsigned long shadow_l2_table(
- struct mm_struct *m, unsigned long gpfn );
-
-#define SHADOW_DEBUG 0
-#define SHADOW_HASH_DEBUG 0
-#define SHADOW_OPTIMISE 1
-
-struct shadow_status {
- unsigned long pfn; // gpfn
- unsigned long spfn_and_flags; // spfn plus flags
- struct shadow_status *next; // use pull-to-front list.
-};
-
-#define shadow_ht_extra_size 128 /*128*/
-#define shadow_ht_buckets 256 /*256*/
-
-#ifndef NDEBUG
-#define SH_LOG(_f, _a...) \
-printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
- current->domain , __LINE__ , ## _a )
-#else
-#define SH_LOG(_f, _a...)
-#endif
-
-#if SHADOW_DEBUG
-#define SH_VLOG(_f, _a...) \
- printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
- current->domain , __LINE__ , ## _a )
-#else
-#define SH_VLOG(_f, _a...)
-#endif
-
-#if 0
-#define SH_VVLOG(_f, _a...) \
- printk("DOM%u: (file=shadow.c, line=%d) " _f "\n", \
- current->domain , __LINE__ , ## _a )
-#else
-#define SH_VVLOG(_f, _a...)
-#endif
-
-
-/************************************************************************/
-
-static inline int __mark_dirty( struct mm_struct *m, unsigned int mfn )
-{
- unsigned int pfn;
- int rc = 0;
-
- ASSERT(spin_is_locked(&m->shadow_lock));
-
- pfn = machine_to_phys_mapping[mfn];
-
- /* We use values with the top bit set to mark MFNs that aren't
- really part of the domain's psuedo-physical memory map e.g.
- the shared info frame. Nothing to do here...
- */
- if ( unlikely(pfn & 0x80000000U) ) return rc;
-
- ASSERT(m->shadow_dirty_bitmap);
- if( likely(pfn<m->shadow_dirty_bitmap_size) )
- {
- /* These updates occur with mm.shadow_lock held, so use
- (__) version of test_and_set */
- if( __test_and_set_bit( pfn, m->shadow_dirty_bitmap ) == 0 )
- {
- // if we set it
- m->shadow_dirty_count++;
- rc = 1;
- }
- }
- else
- {
- extern void show_traceX(void);
- SH_LOG("mark_dirty OOR! mfn=%x pfn=%x max=%x (mm %p)",
- mfn, pfn, m->shadow_dirty_bitmap_size, m );
- SH_LOG("dom=%u caf=%08x taf=%08x\n",
- frame_table[mfn].u.domain->domain,
- frame_table[mfn].count_and_flags,
- frame_table[mfn].type_and_flags );
- }
-
- return rc;
-}
-
-
-static inline int mark_dirty( struct mm_struct *m, unsigned int mfn )
-{
- int rc;
- ASSERT(local_irq_is_enabled());
- //if(spin_is_locked(&m->shadow_lock)) printk("+");
- spin_lock(&m->shadow_lock);
- rc = __mark_dirty( m, mfn );
- spin_unlock(&m->shadow_lock);
- return rc;
-}
-
-
-/************************************************************************/
-
-static inline void l1pte_write_fault( struct mm_struct *m,
- unsigned long *gpte_p, unsigned long *spte_p )
-{
- unsigned long gpte = *gpte_p;
- unsigned long spte = *spte_p;
-
- switch( m->shadow_mode )
- {
- case SHM_test:
- spte = gpte;
- gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
- spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
- break;
-
- case SHM_logdirty:
- spte = gpte;
- gpte |= _PAGE_DIRTY | _PAGE_ACCESSED;
- spte |= _PAGE_RW | _PAGE_DIRTY | _PAGE_ACCESSED;
- __mark_dirty( m, (gpte >> PAGE_SHIFT) );
- break;
- }
-
- *gpte_p = gpte;
- *spte_p = spte;
-}
-
-static inline void l1pte_read_fault( struct mm_struct *m,
- unsigned long *gpte_p, unsigned long *spte_p )
-{
- unsigned long gpte = *gpte_p;
- unsigned long spte = *spte_p;
-
- switch( m->shadow_mode )
- {
- case SHM_test:
- spte = gpte;
- gpte |= _PAGE_ACCESSED;
- spte |= _PAGE_ACCESSED;
- if ( ! (gpte & _PAGE_DIRTY ) )
- spte &= ~ _PAGE_RW;
- break;
-
- case SHM_logdirty:
- spte = gpte;
- gpte |= _PAGE_ACCESSED;
- spte |= _PAGE_ACCESSED;
- spte &= ~ _PAGE_RW;
- break;
- }
-
- *gpte_p = gpte;
- *spte_p = spte;
-}
-
-static inline void l1pte_no_fault( struct mm_struct *m,
- unsigned long *gpte_p, unsigned long *spte_p )
-{
- unsigned long gpte = *gpte_p;
- unsigned long spte = *spte_p;
-
- switch( m->shadow_mode )
- {
- case SHM_test:
- spte = 0;
- if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) ==
- (_PAGE_PRESENT|_PAGE_ACCESSED) )
- {
- spte = gpte;
- if ( ! (gpte & _PAGE_DIRTY ) )
- spte &= ~ _PAGE_RW;
- }
- break;
-
- case SHM_logdirty:
- spte = 0;
- if ( (gpte & (_PAGE_PRESENT|_PAGE_ACCESSED) ) ==
- (_PAGE_PRESENT|_PAGE_ACCESSED) )
- {
- spte = gpte;
- spte &= ~ _PAGE_RW;
- }
-
- break;
- }
-
- *gpte_p = gpte;
- *spte_p = spte;
-}
-
-static inline void l2pde_general( struct mm_struct *m,
- unsigned long *gpde_p, unsigned long *spde_p,
- unsigned long sl1pfn)
-{
- unsigned long gpde = *gpde_p;
- unsigned long spde = *spde_p;
-
- spde = 0;
-
- if ( sl1pfn )
- {
- spde = (gpde & ~PAGE_MASK) | (sl1pfn<<PAGE_SHIFT) |
- _PAGE_RW | _PAGE_ACCESSED | _PAGE_DIRTY;
- gpde = gpde | _PAGE_ACCESSED | _PAGE_DIRTY;
-
- if ( unlikely( (sl1pfn<<PAGE_SHIFT) == (gpde & PAGE_MASK) ) )
- {
- // detect linear map, and keep pointing at guest
- SH_VLOG("4c: linear mapping ( %08lx )",sl1pfn);
- spde = gpde & ~_PAGE_RW;
- }
- }
-
- *gpde_p = gpde;
- *spde_p = spde;
-}
-
-/*********************************************************************/
-
-
-
-#if SHADOW_HASH_DEBUG
-static void shadow_audit(struct mm_struct *m, int print)
-{
- int live=0, free=0, j=0, abs;
- struct shadow_status *a;
-
- for( j = 0; j < shadow_ht_buckets; j++ )
- {
- a = &m->shadow_ht[j];
- if(a->pfn){live++; ASSERT(a->spfn_and_flags&PSH_pfn_mask);}
- ASSERT((a->pfn&0xf0000000)==0);
- ASSERT(a->pfn<0x00100000);
- a=a->next;
- while(a && live<9999)
- {
- live++;
- if(a->pfn == 0 || a->spfn_and_flags == 0)
- {
- printk("XXX live=%d pfn=%08lx sp=%08lx next=%p\n",
- live, a->pfn, a->spfn_and_flags, a->next);
- BUG();
- }
- ASSERT(a->pfn);
- ASSERT((a->pfn&0xf0000000)==0);
- ASSERT(a->pfn<0x00100000);
- ASSERT(a->spfn_and_flags&PSH_pfn_mask);
- a=a->next;
- }
- ASSERT(live<9999);
- }
-
- a = m->shadow_ht_free;
- while(a) { free++; a=a->next; }
-
- if(print) printk("Xlive=%d free=%d\n",live,free);
-
- abs=(perfc_value(shadow_l1_pages)+perfc_value(shadow_l2_pages))-live;
- if( abs < -1 || abs > 1 )
- {
- printk("live=%d free=%d l1=%d l2=%d\n",live,free,
- perfc_value(shadow_l1_pages), perfc_value(shadow_l2_pages) );
- BUG();
- }
-
-}
-
-#else
-#define shadow_audit(p, print)
-#endif
-
-
-
-static inline struct shadow_status* hash_bucket( struct mm_struct *m,
- unsigned int gpfn )
-{
- return &(m->shadow_ht[gpfn % shadow_ht_buckets]);
-}
-
-
-static inline unsigned long __shadow_status( struct mm_struct *m,
- unsigned int gpfn )
-{
- struct shadow_status **ob, *b, *B = hash_bucket( m, gpfn );
-
- b = B;
- ob = NULL;
-
- SH_VVLOG("lookup gpfn=%08x bucket=%p", gpfn, b );
- shadow_audit(m,0); // if in debug mode
-
- do
- {
- if ( b->pfn == gpfn )
- {
- unsigned long t;
- struct shadow_status *x;
-
- // swap with head
- t=B->pfn; B->pfn=b->pfn; b->pfn=t;
- t=B->spfn_and_flags; B->spfn_and_flags=b->spfn_and_flags;
- b->spfn_and_flags=t;
-
- if( ob )
- { // pull to front
- *ob=b->next;
- x=B->next;
- B->next=b;
- b->next=x;
- }
- return B->spfn_and_flags;
- }
-#if SHADOW_HASH_DEBUG
- else
- {
- if(b!=B)ASSERT(b->pfn);
- }
-#endif
- ob=&b->next;
- b=b->next;
- }
- while (b);
-
- return 0;
-}
-
-/* we can make this locking more fine grained e.g. per shadow page if it
-ever becomes a problem, but since we need a spin lock on the hash table
-anyway its probably not worth being too clever. */
-
-static inline unsigned long get_shadow_status( struct mm_struct *m,
- unsigned int gpfn )
-{
- unsigned long res;
-
- /* If we get here, we know that this domain is running in shadow mode.
- We also know that some sort of update has happened to the underlying
- page table page: either a PTE has been updated, or the page has
- changed type. If we're in log dirty mode, we should set the approrpiate
- bit in the dirty bitmap.
- NB: the VA update path doesn't use this so needs to be handled
- independnetly.
- */
-
- ASSERT(local_irq_is_enabled());
- //if(spin_is_locked(&m->shadow_lock)) printk("*");
- spin_lock(&m->shadow_lock);
-
- if( m->shadow_mode == SHM_logdirty )
- __mark_dirty( m, gpfn );
-
- res = __shadow_status( m, gpfn );
- if (!res) spin_unlock(&m->shadow_lock);
- return res;
-}
-
-
-static inline void put_shadow_status( struct mm_struct *m )
-{
- spin_unlock(&m->shadow_lock);
-}
-
-
-static inline void delete_shadow_status( struct mm_struct *m,
- unsigned int gpfn )
-{
- struct shadow_status *b, *B, **ob;
-
- ASSERT(spin_is_locked(&m->shadow_lock));
-
- B = b = hash_bucket( m, gpfn );
-
- SH_VVLOG("delete gpfn=%08x bucket=%p", gpfn, b );
- shadow_audit(m,0);
- ASSERT(gpfn);
-
- if( b->pfn == gpfn )
- {
- if (b->next)
- {
- struct shadow_status *D=b->next;
- b->spfn_and_flags = b->next->spfn_and_flags;
- b->pfn = b->next->pfn;
-
- b->next = b->next->next;
- D->next = m->shadow_ht_free;
- D->pfn = 0;
- D->spfn_and_flags = 0;
- m->shadow_ht_free = D;
- }
- else
- {
- b->pfn = 0;
- b->spfn_and_flags = 0;
- }
-
-#if SHADOW_HASH_DEBUG
- if( __shadow_status(m,gpfn) ) BUG();
- shadow_audit(m,0);
-#endif
- return;
- }
-
- ob = &b->next;
- b=b->next;
-
- do
- {
- if ( b->pfn == gpfn )
- {
- b->pfn = 0;
- b->spfn_and_flags = 0;
-
- // b is in the list
- *ob=b->next;
- b->next = m->shadow_ht_free;
- m->shadow_ht_free = b;
-
-#if SHADOW_HASH_DEBUG
- if( __shadow_status(m,gpfn) ) BUG();
-#endif
- shadow_audit(m,0);
- return;
- }
-
- ob = &b->next;
- b=b->next;
- }
- while (b);
-
- // if we got here, it wasn't in the list
- BUG();
-}
-
-
-static inline void set_shadow_status( struct mm_struct *m,
- unsigned int gpfn, unsigned long s )
-{
- struct shadow_status *b, *B, *extra, **fptr;
- int i;
-
- ASSERT(spin_is_locked(&m->shadow_lock));
-
- B = b = hash_bucket( m, gpfn );
-
- ASSERT(gpfn);
- SH_VVLOG("set gpfn=%08x s=%08lx bucket=%p(%p)", gpfn, s, b, b->next );
-
- shadow_audit(m,0);
-
- do
- {
- if ( b->pfn == gpfn )
- {
- b->spfn_and_flags = s;
- shadow_audit(m,0);
- return;
- }
-
- b=b->next;
- }
- while (b);
-
- // if we got here, this is an insert rather than update
-
- ASSERT( s ); // deletes must have succeeded by here
-
- if ( B->pfn == 0 )
- {
- // we can use this head
- ASSERT( B->next == 0 );
- B->pfn = gpfn;
- B->spfn_and_flags = s;
- shadow_audit(m,0);
- return;
- }
-
- if( unlikely(m->shadow_ht_free == NULL) )
- {
- SH_LOG("allocate more shadow hashtable blocks");
-
- // we need to allocate more space
- extra = kmalloc(sizeof(void*) + (shadow_ht_extra_size *
- sizeof(struct shadow_status)));
-
- if( ! extra ) BUG(); // should be more graceful here....
-
- memset(extra, 0, sizeof(void*) + (shadow_ht_extra_size *
- sizeof(struct shadow_status)));
-
- m->shadow_extras_count++;
-
- // add extras to free list
- fptr = &m->shadow_ht_free;
- for ( i=0; i<shadow_ht_extra_size; i++ )
- {
- *fptr = &extra[i];
- fptr = &(extra[i].next);
- }
- *fptr = NULL;
-
- *((struct shadow_status ** ) &extra[shadow_ht_extra_size]) =
- m->shadow_ht_extras;
- m->shadow_ht_extras = extra;
-
- }
-
- // should really put this in B to go right to front
- b = m->shadow_ht_free;
- m->shadow_ht_free = b->next;
- b->spfn_and_flags = s;
- b->pfn = gpfn;
- b->next = B->next;
- B->next = b;
-
- shadow_audit(m,0);
-
- return;
-}
-
-static inline void __shadow_mk_pagetable( struct mm_struct *mm )
-{
- unsigned long gpfn, spfn=0;
-
- gpfn = pagetable_val(mm->pagetable) >> PAGE_SHIFT;
-
- if ( unlikely((spfn=__shadow_status(mm, gpfn)) == 0 ) )
- {
- spfn = shadow_l2_table(mm, gpfn );
- }
- mm->shadow_table = mk_pagetable(spfn<<PAGE_SHIFT);
-}
-
-static inline void shadow_mk_pagetable( struct mm_struct *mm )
-{
- SH_VVLOG("shadow_mk_pagetable( gptbase=%08lx, mode=%d )",
- pagetable_val(mm->pagetable), mm->shadow_mode );
-
- if ( unlikely(mm->shadow_mode) )
- {
- ASSERT(local_irq_is_enabled());
- spin_lock(&mm->shadow_lock);
-
- __shadow_mk_pagetable( mm );
-
- spin_unlock(&mm->shadow_lock);
- }
-
- SH_VVLOG("leaving shadow_mk_pagetable( gptbase=%08lx, mode=%d ) sh=%08lx",
- pagetable_val(mm->pagetable), mm->shadow_mode,
- pagetable_val(mm->shadow_table) );
-
-}
-
-
-#if SHADOW_DEBUG
-extern int check_pagetable(struct mm_struct *m, pagetable_t pt, char *s);
-#else
-#define check_pagetable(m, pt, s) ((void)0)
-#endif
-
-
-#endif /* XEN_SHADOW_H */
-
-
+#include <asm/shadow.h>